Model Parameterizations

MKTG 585R Quantitative Marketing Pre-PhD Seminar

Covariance Decomposition

Covariance by Components

# Load packages.
library(tidyverse)
library(rethinking)

sigma_alpha <- 1  # Intercept standard deviation.
sigma_beta <- 0.5 # Slope standard deviation.
rho <- -0.7       # Correlation between parameter types.

# Multivariate covariance decomposed into scale and correlation.
sigmas <- c(sigma_alpha, sigma_beta)
Rho <- matrix(c(1, rho, rho, 1), nrow = 2)

# Matrix multiply to get the covariance matrix.
Sigma <- diag(sigmas) %*% Rho %*% diag(sigmas)

quad_form_diag()

diag(sigmas) # Scale
     [,1] [,2]
[1,]    1  0.0
[2,]    0  0.5
Rho          # %*% Correlation Matrix
     [,1] [,2]
[1,]  1.0 -0.7
[2,] -0.7  1.0
diag(sigmas) # %*% Scale
     [,1] [,2]
[1,]    1  0.0
[2,]    0  0.5
Sigma        # = Covariance Matrix
      [,1]  [,2]
[1,]  1.00 -0.35
[2,] -0.35  0.25

Cholesky Decomposition

We can take any square, symmetric matrix R and decompose it into its Cholesky factor L such that:

Rho
     [,1] [,2]
[1,]  1.0 -0.7
[2,] -0.7  1.0
t(chol(Rho)) %*% chol(Rho)
     [,1] [,2]
[1,]  1.0 -0.7
[2,] -0.7  1.0

Centered Multilevel Regression

// Index values, observations, and covariates.
data {
  int<lower = 1> N;                     // Number of observations.
  int<lower = 1> K;                     // Number of groups/clusters.
  int<lower = 1> I;                     // Number of observation-level covariates.
  
  vector[N] y;                          // Vector of observations.
  array[N] int<lower = 1, upper = K> g; // Vector of group assignments.
  matrix[N, I] X;                       // Matrix of observation-level covariates.
}

// Parameters.
parameters {
  vector[I] gamma;                      // Vector of population-level parameters.
  corr_matrix[I] Omega;                 // Population model correlation matrix.
  vector<lower = 0>[I] tau;             // Population model scale parameters. 
  matrix[K, I] Beta;                    // Matrix of observation-level parameters.
  real<lower = 0> sigma;                // Variance of the observation model.
}

// Regression model.
model {
  // Hyperpriors.
  gamma ~ normal(0, 5);
  Omega ~ lkj_corr(4);
  tau ~ normal(0, 5);
  
  // Prior.
  sigma ~ normal(0, 5);

  // Population model.
  for (k in 1:K) {
    Beta[k,] ~ multi_normal(gamma, quad_form_diag(Omega, tau));
  }

  // Observation model.
  for (n in 1:N) {
    y[n] ~ normal(X[n,] * Beta[g[n],]', sigma);
  }
}

// Generate quantities at each draw.
generated quantities {
  // Compute the log-likelihood.
  vector[N] log_lik;
  for (n in 1:N) {
    log_lik[n] = normal_lpdf(y[n] | X[n,] * Beta[g[n],]', sigma);
  }
}

Non-Centered Parameterization

Non-Centered Multilevel Regression

// Index values, observations, and covariates.
data {
  int<lower = 1> N;                     // Number of observations.
  int<lower = 1> K;                     // Number of groups/clusters.
  int<lower = 1> I;                     // Number of observation-level covariates.
  
  vector[N] y;                          // Vector of observations.
  array[N] int<lower = 1, upper = K> g; // Vector of group assignments.
  matrix[N, I] X;                       // Matrix of observation-level covariates.
}

// Parameters.
parameters {
  vector[I] gamma;                      // Vector of population-level parameters.
  cholesky_factor_corr[I] L_Omega;      // Population model correlation matrix Cholesky factor.
  vector<lower = 0>[I] tau;             // Population model scale parameters. 
  matrix[I, K] Delta;                   // Matrix of non-centered observation-level parameters.
  real<lower = 0> sigma;                // Variance of the observation model.
}

// Transformed parameters.
transformed parameters {
  matrix[K, I] V;                       // Matrix of rescaled non-centered observation-level parameters.
  matrix[K, I] Beta;                    // Matrix of observation-level parameters.
  
  // Compute the non-centered parameterization.
  V = (diag_pre_multiply(tau, L_Omega) * Delta)';
  for (i in 1:I) {
    Beta[,i] = gamma[i] + V[,i];
  }
}

// Regression model.
model {
  // Hyperpriors.
  gamma ~ normal(0, 5);
  L_Omega ~ lkj_corr_cholesky(4);
  tau ~ normal(0, 5);
  
  // Prior.
  sigma ~ normal(0, 5);

  // Population model.
  to_vector(Delta) ~ normal(0, 1);

  // Observation model.
  for (n in 1:N) {
    y[n] ~ normal(X[n,] * Beta[g[n],]', sigma);
  }
}

// Generate quantities at each draw.
generated quantities {
  // Compute the log-likelihood.
  vector[N] log_lik;
  for (n in 1:N) {
    log_lik[n] = normal_lpdf(y[n] | X[n,] * Beta[g[n],]', sigma);
  }
  
  // Compute the correlation matrix Omega.
  corr_matrix[I] Omega;
  Omega = multiply_lower_tri_self_transpose(L_Omega);
}

Including Group Covariates

“Unknown” and “Known” Heterogeneity

Having a population model allows us to characterize unknown heterogeneity across the population such that the varying effects are drawn from the given population.

// Population model.
for (k in 1:K) {
  Beta[k,] ~ multi_normal(gamma, quad_form_diag(Omega, tau));
}

If we also have data that might help explain that heterogeneity, we can introduce a second linear model that models known heterogeneity.

// Population model.
for (k in 1:K) {
  Beta[k,] ~ multi_normal(Z[k,] * Gamma, quad_form_diag(Omega, tau));
}

Non-Centered Multilevel Regression with Population-Level Covariates

// Index values, hyperprior values, observations, and covariates.
data {
  int<lower = 1> N;                     // Number of observations.
  int<lower = 1> K;                     // Number of groups/clusters.
  int<lower = 1> I;                     // Number of observation-level covariates.
  int<lower = 1> J;                     // Number of population-level covariates.
  
  real Gamma_mean;                      // Mean of population-level means.
  real<lower = 0> Gamma_scale;          // Scale of population-level means.
  real<lower = 0> Omega_shape;          // Shape of population-level scale.
  real tau_mean;                        // Mean of population-level scale.
  real<lower = 0> tau_scale;            // Scale of population-level scale.
  real sigma_mean;                      // Mean of observation-level variance.
  real<lower = 0> sigma_scale;          // Scale of observation-level variance.
  
  vector[N] y;                          // Vector of observations.
  array[N] int<lower = 1, upper = K> g; // Vector of group assignments.
  matrix[N, I] X;                       // Matrix of observation-level covariates.
  matrix[K, J] Z;                       // Matrix of population-level covariates.
}

// Parameters.
parameters {
  matrix[J, I] Gamma;                   // Vector of population-level parameters.
  cholesky_factor_corr[I] L_Omega;      // Population model correlation matrix Cholesky factor.
  vector<lower = 0>[I] tau;             // Population model scale parameters. 
  matrix[I, K] Delta;                   // Matrix of non-centered observation-level parameters.
  real<lower = 0> sigma;                // Variance of the observation model.
}

// Transformed parameters.
transformed parameters {
  matrix[K, I] V;                       // Matrix of rescaled non-centered observation-level parameters.
  matrix[K, I] Beta;                    // Matrix of observation-level parameters.
  
  // Compute the non-centered parameterization.
  V = (diag_pre_multiply(tau, L_Omega) * Delta)';
  for (i in 1:I) {
    Beta[,i] = Z * Gamma[,i] + V[,i];
  }
}

// Regression model.
model {
  // Hyperpriors.
  to_vector(Gamma) ~ normal(Gamma_mean, Gamma_scale);
  L_Omega ~ lkj_corr_cholesky(Omega_shape);
  tau ~ normal(tau_mean, tau_scale);
  
  // Prior.
  sigma ~ normal(sigma_mean, sigma_scale);

  // Population model.
  to_vector(Delta) ~ normal(0, 1);

  // Observation model.
  for (n in 1:N) {
    y[n] ~ normal(X[n,] * Beta[g[n],]', sigma);
  }
}

// Generate quantities at each draw.
generated quantities {
  // Compute the log-likelihood.
  vector[N] log_lik;
  for (n in 1:N) {
    log_lik[n] = normal_lpdf(y[n] | X[n,] * Beta[g[n],]', sigma);
  }
  
  // Compute the correlation matrix Omega.
  corr_matrix[I] Omega;
  Omega = multiply_lower_tri_self_transpose(L_Omega);
}

Removing the Scaffolding Redux

Stan has a number of awesome resources (including Statistical Rethinking): the Stan User’s Guide (plus the Stan Reference Manual and the Stan Functions Reference).

There are a growing number of supporting packages: {bayesplot}, {posterior}, {loo}, {ggdist}, {tidybayes}, and {brms}.